package us.codecraft.webmagic.samples;

import org.apache.commons.collections.map.HashedMap;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Sample {@link PageProcessor} that extracts the link texts found in the
 * {@code valign="top"} table rows of a crawled page, prints each one, and keeps
 * a running count of how many were printed.
 *
 * <p>Created 2017-09-28.
 *
 * @author Luo Wenlong (骆文龙)
 * @version v1.0
 */
public class TestProcessor implements PageProcessor {
    /** Start URL crawled by {@link #main(String[])}. */
    private static final String START_URL = "http://www.zhishubao.com/rank.php";

    /** XPath selecting the text of every link inside a {@code valign="top"} table row. */
    private static final String CATEGORY_XPATH = "//tr[@valign='top']/td/a/text()";

    /**
     * Entry at which processing of a page stops.
     * NOTE(review): presumably the first non-category (region) entry on the page - confirm.
     */
    private static final String STOP_MARKER = "北京";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /** Backing map returned by {@link #getDate()}; nothing in this class populates it. */
    private final Map<String, Object> dateMap = new HashMap<String, Object>();

    /**
     * Number of entries printed so far. Atomic because the spider in
     * {@link #main(String[])} is started with several worker threads that share
     * this processor instance, and the value is read back from the main thread.
     */
    private final AtomicInteger size = new AtomicInteger();

    /**
     * Prints the number of matched entries, then prints and counts each entry
     * until the stop marker is encountered.
     *
     * @param page the downloaded page to extract entries from
     */
    @Override
    public void process(Page page) {
        List<String> list = page.getHtml().xpath(CATEGORY_XPATH).all();
        System.out.println(list.size());

        for (String str : list) {
            // Constant-first comparison so a null entry cannot throw.
            if (STOP_MARKER.equals(str)) {
                // Everything from the marker onwards is ignored.
                return;
            }
            size.incrementAndGet();
            System.out.println("类别 => " + str);
        }
    }

    /**
     * Returns the crawl configuration (3 retries, 1000 ms sleep time).
     *
     * @return the site settings used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Returns the processor's internal data map.
     *
     * @return the live, mutable backing map (not a copy); empty unless a caller fills it
     */
    public Map<String, Object> getDate() {
        return dateMap;
    }

    /**
     * Crawls the start URL with five threads and prints the number of counted
     * entries together with the elapsed time in whole seconds.
     *
     * @param arg0 command-line arguments (unused)
     */
    public static void main(String[] arg0) {
        TestProcessor test = new TestProcessor();
        System.out.println("【爬虫开始】...");

        long startTime = System.currentTimeMillis();
        Spider.create(test).addUrl(START_URL).thread(5).run();
        long endTime = System.currentTimeMillis();

        // NOTE(review): the message says results were saved to a database, but no
        // pipeline is added to the spider here - confirm the wording is intended.
        System.out.println("【爬虫结束】共抓取" + test.size.get() + "个类别，耗时约" + ((endTime - startTime) / 1000) + "秒，已保存到数据库，请查收！");
    }
}
